library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
library(tidyr)
library(stringr)
library(ggplot2)
library(fdrtool)
library(corrplot)
## corrplot 0.88 loaded
meta_qc_align <- read_tsv(snakemake@input$meta_qc_align)
##
## ── Column specification ────────────────────────────────────────────────────────────────────────
## cols(
## cohort = col_character(),
## ancestries = col_character(),
## release = col_character(),
## N_cases = col_double(),
## N_controls = col_double(),
## N_snps = col_double(),
## N_snps_aligned = col_double(),
## median_or = col_double(),
## max_OR = col_double(),
## mean_SE = col_double(),
## max_SE = col_double()
## )
meta_samples <-
meta_qc_align %>%
filter(!cohort %in% c('MDD29', 'PGC')) %>%
group_by(cohort) %>%
summarize(Cases=sum(N_cases), Controls=sum(N_controls)) %>%
pivot_longer(Cases:Controls, names_to="MDD", values_to="N") %>%
arrange(desc(N))
cohort_order <-
meta_samples %>%
filter(MDD == 'Cases') %>%
arrange(desc(N)) %>%
pull(cohort)
ggplot(meta_samples, aes(x=0, y=0, color=MDD, size=N)) +
geom_point() +
facet_wrap(~factor(cohort, levels=cohort_order)) +
scale_size_area(max_size=30) +
theme_minimal() +
theme(axis.line=element_blank(),
axis.text.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks=element_blank(),
axis.title.x=element_blank(),
axis.title.y=element_blank())
Marker names were aligned to the meta-analysis reference panel and effect sizes were checked. SNPs with MAF < 0.01 in the imputation reference panel and INFO < 0.1 in the cohort were removed.
meta_qc_align %>% print(n=Inf)
## # A tibble: 32 x 11
## cohort ancestries release N_cases N_controls N_snps N_snps_aligned median_or
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 23andMe eas v7_2 2727 90220 5.67e6 4589798 1
## 2 23andMe eur v7_2_2… 112892 1773938 7.50e6 6035449 1
## 3 AGDS eur 202012 12123 12684 7.62e6 7582482 1.00
## 4 Airwave eur 0820 2100 15713 7.49e6 7472104 1.00
## 5 ALSPAC eur 120820… 472 3475 9.05e6 5616108 1.00
## 6 BASIC eur 202011 1003 1854 7.77e6 6718587 0.999
## 7 BioVU eur Cov_SA… 7757 24723 6.26e6 5816204 1.00
## 8 BioVU eur NoCov_… 7757 24723 6.26e6 5816204 1.00
## 9 DBDS eur FINAL2… 13347 145996 7.59e6 6623408 1.00
## 10 deCODE eur DEPALL… 20000 28000 8.84e6 7322173 1
## 11 ESTBB eur EstBB 35473 91301 2.69e7 7603277 1.00
## 12 EXCEED eur 202010 580 2071 8.08e6 5948847 0.999
## 13 FinnGen eur R5_180… 23424 192220 1.64e7 7626331 1.00
## 14 GenScot eur 1215a 997 6358 7.71e6 6371459 1.00
## 15 GERA eur 0915a_… 7162 38307 1.09e7 7855034 1
## 16 HUNT eur gp_hos… 11658 42535 8.69e6 7794399 1.00
## 17 iPSYCH eur 2012_H… 19156 22708 8.81e6 7664074 1
## 18 iPSYCH eur 2015i_… 10002 15434 8.85e6 7683643 1
## 19 lgic2 eur 202011 906 4717 7.76e6 6576356 0.999
## 20 MDD29 eur 0120a_… 16674 25452 9.75e6 7962860 1.00
## 21 MDD49 eur 29w2_2… 28147 48033 7.71e6 7677489 0.998
## 22 MoBa eur harves… 603 10213 6.50e6 5414660 1.00
## 23 MoBa eur harves… 367 6122 6.50e6 4821265 1.00
## 24 MoBa eur rotter… 553 8860 6.50e6 5324929 1.00
## 25 MVP eur 4_0ICD… 151974 226640 6.08e6 5269999 1
## 26 PBK eur 2020 5607 16080 7.35e6 7311190 1
## 27 PGC eur wray20… 135458 344901 1.36e7 7941764 1
## 28 PREFECT eur run1 1796 3290 8.96e6 7529319 0.999
## 29 SHARE eur godart… 1063 1921 1.36e7 6534886 1
## 30 STAGE eur MDDdx_… 421 9134 7.43e6 5248073 0.999
## 31 tkda1 eur run1 672 846 8.86e6 6135331 0.998
## 32 UKBB eur MD_glm… 41500 319630 9.10e6 7396841 1
## # … with 3 more variables: max_OR <dbl>, mean_SE <dbl>, max_SE <dbl>
Median odds-ratios with standard error
ggplot(meta_qc_align %>%
unite('cohort',
cohort, ancestries, release,
sep="."),
aes(x=reorder(cohort, (4*N_cases*N_controls)/(N_cases + N_controls)),
y=median_or,
ymax=exp(log(median_or)+mean_SE),
ymin=exp(log(median_or)-mean_SE))) +
geom_linerange() +
geom_point() +
coord_flip(ylim=c(0.8, 1.25))
Mean standard error versus effective sample size:
ggplot(meta_qc_align %>%
unite('cohort',
cohort, ancestries, release,
sep=".") %>%
mutate(Neff=(4*N_cases*N_controls)/(N_cases + N_controls)),
aes(x=Neff, y=mean_SE, label=cohort)) +
geom_text(check_overlap=TRUE, size=2) +
scale_x_log10(breaks=c(1, 100, 1000, 5000, 10000, 50000, 100000, 500000))
LDSC genetic correlations were calculated with the PGC MDD29 clinical cohorts
meta_qc_ldsc <-
read_table2(snakemake@input$meta_qc_ldsc) %>%
mutate(se.mdd2=as.numeric(str_remove_all(se.mdd2, "[\\(\\)]")),
se.mdd29=as.numeric(str_remove_all(se.mdd29, "[\\(\\)]")))
##
## ── Column specification ────────────────────────────────────────────────────────────────────────
## cols(
## cohort = col_character(),
## release = col_character(),
## rg.mdd2 = col_double(),
## se.mdd2 = col_character(),
## gencov.mdd2 = col_double(),
## rg.mdd29 = col_double(),
## se.mdd29 = col_character(),
## gencov.mdd29 = col_double()
## )
## Warning: 3 parsing failures.
## row col expected actual file
## 4 -- 8 columns 5 columns 'docs/tables/meta_qc_ldsc.txt'
## 5 -- 8 columns 5 columns 'docs/tables/meta_qc_ldsc.txt'
## 20 -- 8 columns 5 columns 'docs/tables/meta_qc_ldsc.txt'
ggplot(meta_qc_ldsc, aes(x=reorder(paste(cohort, release), rg.mdd29), y=rg.mdd29, ymin=rg.mdd29-se.mdd29, ymax=rg.mdd29+se.mdd29)) +
geom_linerange() +
geom_point() +
coord_flip(ylim=c(0, 1.25))
## Warning: Removed 8 rows containing missing values (geom_segment).
## Warning: Removed 8 rows containing missing values (geom_point).
Pairwise LDSC genetic covariance intercepts between all cohorts
meta_qc_ldsc_pairs <- read_tsv(snakemake@input$meta_qc_ldsc_pairs)
##
## ── Column specification ────────────────────────────────────────────────────────────────────────
## cols(
## cohort1 = col_character(),
## subcohort1 = col_character(),
## cohort2 = col_character(),
## subcohort2 = col_character(),
## ancestry1 = col_character(),
## ancestry2 = col_character(),
## rg = col_double(),
## se = col_double(),
## z = col_double(),
## p = col_double(),
## h2_obs = col_double(),
## h2_obs_se = col_double(),
## h2_int = col_double(),
## h2_int_se = col_double(),
## gcov_int = col_double(),
## gcov_int_se = col_double()
## )
# calculate total sample sizes
meta_qc_samplesize <- meta_qc_align %>%
transmute(cohort, ancestries, release, N=N_cases+N_controls)
# sumstats appearing in MDD3 (exclude other
# sumstats like Wray 2018 that have had their
# sumstats munged for other reasons)
meta_qc_ldsc_pairs_mdd3 <- meta_qc_ldsc_pairs %>%
filter(!cohort1 %in% c('MDD29', 'PGC') & !cohort2 %in% c('MDD29', 'PGC')) %>%
left_join(meta_qc_samplesize %>% rename(cohortN1=N), by=c('cohort1'='cohort', 'subcohort1'='release', 'ancestry1'='ancestries')) %>%
left_join(meta_qc_samplesize %>% rename(cohortN2=N), by=c('cohort2'='cohort', 'subcohort2'='release', 'ancestry2'='ancestries'))
Histogram of intercepts
ggplot(meta_qc_ldsc_pairs_mdd3, aes(x=gcov_int)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 44 rows containing non-finite values (stat_bin).
Pairwise intercepts
Show the largest and smallest intercepts. Exclude Wray2018 sumstats that are also in the mix. Calculate distance from 0 in standard deviation units to get an idea of the magnitude of departure from expectation relative to the other intercepts.
meta_qc_ldsc_pairs_mdd3 %>%
select(cohort1, subcohort1, cohort2, subcohort2, gcov_int) %>%
mutate(SDs=abs(gcov_int)/sd(gcov_int, na.rm=T)) %>%
arrange(desc(gcov_int))
## # A tibble: 378 x 6
## cohort1 subcohort1 cohort2 subcohort2 gcov_int SDs
## <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 23andMe v7_2_202012 iPSYCH 2012_HRC 0.0219 3.46
## 2 MoBa harvest12 MoBa rotterdam1 0.0213 3.36
## 3 MDD49 29w2_20w3_1504 tkda1 run1 0.0212 3.35
## 4 HUNT gp_hospital_metacarpa_20190625 MoBa harvest12 0.0203 3.21
## 5 HUNT gp_hospital_metacarpa_20190625 iPSYCH 2015i_HRC 0.0165 2.61
## 6 HUNT gp_hospital_metacarpa_20190625 MVP 4_0ICDdep_2021… 0.0165 2.61
## 7 MVP 4_0ICDdep_202106 PBK 2020 0.015 2.37
## 8 AGDS 202012 iPSYCH 2012_HRC 0.0147 2.32
## 9 HUNT gp_hospital_metacarpa_20190625 iPSYCH 2012_HRC 0.0146 2.31
## 10 MDD49 29w2_20w3_1504 PREFECT run1 0.0144 2.27
## # … with 368 more rows
meta_qc_ldsc_pairs_mdd3 %>%
select(cohort1, subcohort1, cohort2, subcohort2, gcov_int) %>%
mutate(SDs=abs(gcov_int)/sd(gcov_int, na.rm=T)) %>%
arrange(gcov_int)
## # A tibble: 378 x 6
## cohort1 subcohort1 cohort2 subcohort2 gcov_int SDs
## <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 GERA 0915a_mds5 MoBa harvest12 -0.0172 2.72
## 2 MVP 4_0ICDdep_202106 lgic2 202011 -0.0127 2.01
## 3 23andMe v7_2_202012 BASIC 202011 -0.0123 1.94
## 4 23andMe v7_2_202012 Airwave 0820 -0.012 1.90
## 5 SHARE godartsshare_842021 iPSYCH 2012_HRC -0.0116 1.83
## 6 EXCEED 202010 MoBa harvest12 -0.0109 1.72
## 7 GERA 0915a_mds5 iPSYCH 2012_HRC -0.0109 1.72
## 8 23andMe v7_2_202012 BioVU NoCov_SAIGE_051821 -0.0106 1.67
## 9 MoBa harvest12 SHARE godartsshare_842021 -0.0106 1.67
## 10 23andMe v7_2_202012 ALSPAC 12082019 -0.0103 1.63
## # … with 368 more rows
Estimate amount of sample overlap as \(N_S = g_{\mathrm{cov}_\mathrm{int}}\sqrt{N_1N_2} / r_\mathrm{P}\), where \(r_\mathrm{P}\) is the phenotypic correlation between the two MDD phenotype (assume \(r_\mathrm{P} = 1\))
rP <- 1
meta_qc_ldsc_pairs_mdd3_ns <-
meta_qc_ldsc_pairs_mdd3 %>%
mutate(Ns_factor=sqrt(cohortN1*cohortN2)/rP) %>%
mutate(Ns=gcov_int*Ns_factor,
Ns_l95=(gcov_int+qnorm(0.025)*gcov_int_se)*Ns_factor,
Ns_u95=(gcov_int+qnorm(0.975)*gcov_int_se)*Ns_factor,
chisq=gcov_int^2/gcov_int_se^2) %>%
filter(!is.na(chisq)) %>%
mutate(qval=fdrtool(pchisq(chisq, df=1, lower.tail=F), statistic='pvalue', plot=FALSE)$qval) %>%
select(cohort1, subcohort1, cohort2, subcohort2, cohortN1, cohortN2, chisq, qval, Ns, Ns_l95, Ns_u95) %>%
mutate(Ns_pct=100*Ns/(cohortN1+cohortN2))
## Step 1... determine cutoff point
## Step 2... estimate parameters of null distribution and eta0
## Step 3... compute p-values and estimate empirical PDF/CDF
## Step 4... compute q-values and local fdr
Sort (largest-to-smallest) by sample overlap
meta_qc_ldsc_pairs_mdd3_ns %>%
arrange(desc(Ns)) %>%
select(-chisq, -qval, -Ns_pct)
## # A tibble: 334 x 9
## cohort1 subcohort1 cohort2 subcohort2 cohortN1 cohortN2 Ns Ns_l95 Ns_u95
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 23andMe v7_2_202012 iPSYCH 2012_HRC 1886830 41864 6155. 1.03e3 11278.
## 2 23andMe v7_2_202012 ESTBB EstBB 1886830 126774 5478. -1.90e3 12859.
## 3 23andMe v7_2_202012 MVP 4_0ICDdep… 1886830 378614 4818. -1.27e4 22377.
## 4 23andMe v7_2_202012 UKBB MD_glm_20… 1886830 361130 4788. -7.99e3 17569.
## 5 MVP 4_0ICDdep_… UKBB MD_glm_20… 378614 361130 3106. -2.84e3 9049.
## 6 23andMe v7_2_202012 GERA 0915a_mds5 1886830 45469 2519. -1.90e3 6939.
## 7 HUNT gp_hospita… MVP 4_0ICDdep… 54193 378614 2363. 3.70e2 4357.
## 8 FinnGen R5_18032020 MVP 4_0ICDdep… 215644 378614 2229. -1.86e3 6317.
## 9 MDD49 29w2_20w3_… UKBB MD_glm_20… 76180 361130 2189. 4.38e1 4335.
## 10 ESTBB EstBB UKBB MD_glm_20… 126774 361130 1990. 1.03e2 3877.
## # … with 324 more rows
Sort (largest-to-smallest) by percentage of overlap to combined sample size
meta_qc_ldsc_pairs_mdd3_ns %>%
arrange(desc(Ns_pct)) %>%
select(-cohortN1, -cohortN2, -chisq, -qval)
## # A tibble: 334 x 8
## cohort1 subcohort1 cohort2 subcohort2 Ns Ns_l95 Ns_u95 Ns_pct
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 MoBa harvest12 MoBa rotterdam1 215. 102. 328. 1.06
## 2 HUNT gp_hospital_metacarp… iPSYCH 2015i_HRC 613. 220. 1006. 0.769
## 3 HUNT gp_hospital_metacarp… MoBa harvest12 491. 183. 800. 0.756
## 4 HUNT gp_hospital_metacarp… iPSYCH 2012_HRC 695. 88.6 1302. 0.724
## 5 AGDS 202012 iPSYCH 2012_HRC 474. 50.5 897. 0.711
## 6 PREFECT run1 STAGE MDDdx_saige 89.2 9.98 168. 0.609
## 7 STAGE MDDdx_saige lgic2 202011 92.4 16.2 168. 0.608
## 8 iPSYCH 2012_HRC iPSYCH 2015i_HRC 401. 4.84 798. 0.596
## 9 GenScot 1215a PBK 2020 172. 52.9 291. 0.591
## 10 GERA 0915a_mds5 MDD49 29w2_20w3_1… 712. 43.1 1381. 0.585
## # … with 324 more rows
Sort by \(\chi^2\) (Wald) test statistics
meta_qc_ldsc_pairs_mdd3_ns %>%
arrange(desc(chisq)) %>%
select(-cohortN1, -cohortN2)
## # A tibble: 334 x 10
## cohort1 subcohort1 cohort2 subcohort2 chisq qval Ns Ns_l95 Ns_u95
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 MoBa harvest12 MoBa rotterdam1 14.0 0.0381 215. 102. 328.
## 2 MDD49 29w2_20w3_1504 tkda1 run1 13.4 0.0381 228. 106. 350.
## 3 HUNT gp_hospital_me… MoBa harvest12 9.75 0.150 491. 183. 800.
## 4 HUNT gp_hospital_me… iPSYCH 2015i_HRC 9.34 0.166 613. 220. 1006.
## 5 GERA 0915a_mds5 UKBB MD_glm_202… 8.07 0.226 1820. 564. 3075.
## 6 GenScot 1215a PBK 2020 8.03 0.227 172. 52.9 291.
## 7 GERA 0915a_mds5 MoBa harvest12 7.45 0.268 -381. -655. -108.
## 8 STAGE MDDdx_saige lgic2 202011 5.65 0.426 92.4 16.2 168.
## 9 23andMe v7_2_202012 iPSYCH 2012_HRC 5.55 0.434 6155. 1032. 11278.
## 10 HUNT gp_hospital_me… MVP 4_0ICDdep_… 5.40 0.446 2363. 370. 4357.
## # … with 324 more rows, and 1 more variable: Ns_pct <dbl>
Test for heterogeniety in covariance intercepts. Calculate \(w\), the inverse variance of each \(g_\mathrm{covint}\) standard error, then calculate a weighted sum \(\hat{g_\mathrm{covint}}\)
meta_qc_ldsc_pairs_mdd3 %>%
filter(!is.na(gcov_int)) %>%
mutate(w=1/gcov_int_se^2) %>%
mutate(gcov_int_hat=sum(w*gcov_int)/sum(w)) %>%
summarise(Q=sum(w*(gcov_int-gcov_int_hat)^2), k=n()) %>%
mutate(I2=(Q-(k-1))/Q)
## # A tibble: 1 x 3
## Q k I2
## <dbl> <int> <dbl>
## 1 382. 334 0.128
There is thus some heterogeneity in genetic covariance intercepts. We also want to look at heterogeniety per-cohort. The LDSC intercepts were calculated for each unique pair of cohorts. Expand this table to have every ordering of each pair
# get unique cohort names from name and ancestry columns
meta_qc_ldsc_pairs_mdd3_named <-
meta_qc_ldsc_pairs_mdd3 %>%
mutate(cohort1=paste(cohort1, subcohort1, ancestry1, sep='.'),
cohort2=paste(cohort2, subcohort2, ancestry2, sep='.')) %>%
mutate(pair_name=paste(cohort1, cohort2, sep=','))
# get all unique cohort names
cohorts <- unique(c(meta_qc_ldsc_pairs_mdd3_named$cohort1,
meta_qc_ldsc_pairs_mdd3_named$cohort2))
# ordered pairs of cohort we have data listings for
cohorts_pair_names <- unique(meta_qc_ldsc_pairs_mdd3_named$pair_name)
# create list of all pairs of cohorts
meta_qc_ldsc_pairs_mdd3_all <-
tibble(cohort1=cohorts, cohort2=cohorts) %>%
complete(cohort1, cohort2) %>%
filter(cohort1 != cohort2) %>%
mutate(pair_name12=paste(cohort1, cohort2, sep=','),
pair_name21=paste(cohort2, cohort1, sep=',')) %>%
mutate(pair_name=case_when(pair_name12 %in% cohorts_pair_names ~ pair_name12,
pair_name21 %in% cohorts_pair_names ~ pair_name21,
TRUE ~ NA_character_)) %>%
select(cohort1, cohort2, pair_name) %>%
left_join(meta_qc_ldsc_pairs_mdd3_named %>% select(pair_name, gcov_int, gcov_int_se), by='pair_name')
Calculate \(Q\) and \(I^2\) for each cohort
meta_qc_ldsc_pairs_mdd3_all_het <-
meta_qc_ldsc_pairs_mdd3_all %>%
group_by(cohort1) %>%
filter(!is.na(gcov_int)) %>%
mutate(w=1/gcov_int_se^2) %>%
mutate(gcov_int_hat=sum(w*gcov_int)/sum(w)) %>%
summarise(Q=sum(w*(gcov_int-gcov_int_hat)^2), k=n()) %>%
mutate(I2=(Q-(k-1))/Q) %>%
arrange(desc(I2))
meta_qc_ldsc_pairs_mdd3_all_het %>%
as.data.frame()
## cohort1 Q k I2
## 1 MoBa.harvest12.eur 46.54321 24 0.50583550
## 2 GERA.0915a_mds5.eur 42.34190 24 0.45680289
## 3 tkda1.run1.eur 24.89089 16 0.39736981
## 4 iPSYCH.2012_HRC.eur 38.03735 26 0.34275121
## 5 MoBa.rotterdam1.eur 37.60505 26 0.33519566
## 6 BASIC.202011.eur 19.58703 15 0.28524146
## 7 HUNT.gp_hospital_metacarpa_20190625.eur 32.85551 25 0.26952888
## 8 PREFECT.run1.eur 30.54272 24 0.24695631
## 9 STAGE.MDDdx_saige.eur 31.48149 27 0.17411785
## 10 MDD49.29w2_20w3_1504.eur 28.67780 25 0.16311567
## 11 GenScot.1215a.eur 27.35163 25 0.12253850
## 12 ALSPAC.12082019.eur 22.65496 22 0.07305087
## 13 lgic2.202011.eur 25.82344 25 0.07061171
## 14 23andMe.v7_2_202012.eur 27.22148 27 0.04487205
## 15 PBK.2020.eur 23.39237 24 0.01677345
## 16 AGDS.202012.eur 23.29556 25 -0.03023908
## 17 BioVU.NoCov_SAIGE_051821.eur 22.16441 24 -0.03769944
## 18 iPSYCH.2015i_HRC.eur 22.98309 25 -0.04424587
## 19 UKBB.MD_glm_202012.eur 21.66286 25 -0.10788710
## 20 SHARE.godartsshare_842021.eur 19.85713 23 -0.10791446
## 21 MVP.4_0ICDdep_202106.eur 22.36432 27 -0.16256598
## 22 EXCEED.202010.eur 20.26973 25 -0.18403178
## 23 DBDS.FINAL202103.eur 20.34515 26 -0.22879436
## 24 FinnGen.R5_18032020.eur 19.26253 26 -0.29785641
## 25 deCODE.DEPALL_FINAL_WHEAD.eur 17.50199 25 -0.37127287
## 26 ESTBB.EstBB.eur 17.21966 25 -0.39375541
## 27 Airwave.0820.eur 16.91729 26 -0.47777808
## 28 MoBa.harvest24.eur 5.07686 11 -0.96972158
# find range to fit all gcov_int values
gcov_int_min <- min(with(meta_qc_ldsc_pairs_mdd3, gcov_int-gcov_int_se), na.rm=TRUE)
gcov_int_max <- max(with(meta_qc_ldsc_pairs_mdd3, gcov_int+gcov_int_se), na.rm=TRUE)
ggplot(meta_qc_ldsc_pairs_mdd3_all %>% filter(cohort1 %in% meta_qc_ldsc_pairs_mdd3_all_het$cohort1[1:4]),
aes(x=cohort2, y=gcov_int, ymin=gcov_int-gcov_int_se, ymax=gcov_int+gcov_int_se)) +
geom_point() +
geom_linerange() +
facet_grid(cols=vars(cohort1)) +
coord_flip(ylim=c(gcov_int_min-0.01, gcov_int_max+0.01))
## Warning: Removed 18 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_segment).
## Warning: Removed 3 rows containing missing values (geom_segment).
## Warning: Removed 11 rows containing missing values (geom_segment).
ggplot(meta_qc_ldsc_pairs_mdd3_all %>% filter(cohort1 %in% meta_qc_ldsc_pairs_mdd3_all_het$cohort1[5:8]),
aes(x=cohort2, y=gcov_int, ymin=gcov_int-gcov_int_se, ymax=gcov_int+gcov_int_se)) +
geom_point() +
geom_linerange() +
facet_grid(cols=vars(cohort1)) +
coord_flip(ylim=c(gcov_int_min-0.01, gcov_int_max+0.01))
## Warning: Removed 18 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing missing values (geom_segment).
## Warning: Removed 2 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_segment).
## Warning: Removed 3 rows containing missing values (geom_segment).
Cluster based on similarity in genetic covariance intercepts.
# get a list of all cohort/subcohort names
subcohorts <-
bind_rows(
select(meta_qc_ldsc_pairs_mdd3, cohort=cohort1, subcohort=subcohort1),
select(meta_qc_ldsc_pairs_mdd3, cohort=cohort2, subcohort=subcohort2)
) %>%
distinct()
Make a matrix:
cohort_names <-
subcohorts %>%
transmute(cohort=paste(cohort, subcohort, sep='.')) %>%
pull(cohort)
gcov_int_mat <- diag(length(cohort_names))
dimnames(gcov_int_mat) <- list(cohort_names, cohort_names)
for(i in seq.int(nrow(meta_qc_ldsc_pairs_mdd3))) {
cohort1 <- meta_qc_ldsc_pairs_mdd3$cohort1[i]
cohort2 <- meta_qc_ldsc_pairs_mdd3$cohort2[i]
subcohort1 <- meta_qc_ldsc_pairs_mdd3$subcohort1[i]
subcohort2 <- meta_qc_ldsc_pairs_mdd3$subcohort2[i]
cohort_name1 <- paste(cohort1, subcohort1, sep='.')
cohort_name2 <- paste(cohort2, subcohort2, sep='.')
# if gcov_int is NA, substitute 0 (average)
gcov_int <- coalesce(meta_qc_ldsc_pairs_mdd3$gcov_int[i], 0)
gcov_int_mat[cohort_name1,cohort_name2] <- gcov_int
gcov_int_mat[cohort_name2,cohort_name1] <- gcov_int
}
Convert intercepts into a dissimilarity matrix. For intercepts, 1 == same, 0 == average, <0 == different. For dissimilarity, 0 == same and larger values == more dissimilar.
plot(hclust(as.dist(1-gcov_int_mat)))
corrplot(gcov_int_mat, is.corr=FALSE, diag=FALSE)
rg_mat <- diag(length(cohort_names))
dimnames(rg_mat) <- list(cohort_names, cohort_names)
for(i in seq.int(nrow(meta_qc_ldsc_pairs_mdd3))) {
cohort1 <- meta_qc_ldsc_pairs_mdd3$cohort1[i]
cohort2 <- meta_qc_ldsc_pairs_mdd3$cohort2[i]
subcohort1 <- meta_qc_ldsc_pairs_mdd3$subcohort1[i]
subcohort2 <- meta_qc_ldsc_pairs_mdd3$subcohort2[i]
cohort_name1 <- paste(cohort1, subcohort1, sep='.')
cohort_name2 <- paste(cohort2, subcohort2, sep='.')
# if rg is NA, substitute 0 (average)
rg <- meta_qc_ldsc_pairs_mdd3$rg[i]
rg_mat[cohort_name1,cohort_name2] <- rg
rg_mat[cohort_name2,cohort_name1] <- rg
}
rg_mat_1 <- rg_mat
rg_mat_1[which(rg_mat_1 > 1)] <- 1
rg_mat_1[which(rg_mat_1 < -1)] <- -1
corrplot(rg_mat_1, na.label='.')